From 6289f3c48ca8dde511d227411e16b731d3c05328 Mon Sep 17 00:00:00 2001 From: Nik Everett Date: Fri, 18 Apr 2014 09:16:04 -0400 Subject: [PATCH] Make HtmlFormatter return removed elements This shouldn't cause much overhead and is useful for getting the text of the removed tags. Change-Id: I97cf66014719244b8bb2b0509b419c82202bdb01 --- includes/HtmlFormatter.php | 22 +++++++++++------- tests/phpunit/includes/HtmlFormatterTest.php | 24 ++++++++++++++++---- 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/includes/HtmlFormatter.php b/includes/HtmlFormatter.php index 7f590e5207..96ffe9ea5c 100644 --- a/includes/HtmlFormatter.php +++ b/includes/HtmlFormatter.php @@ -128,7 +128,9 @@ class HtmlFormatter { } /** - * Removes content we've chosen to remove + * Removes content we've chosen to remove. The text of the removed elements can be + * extracted with the getText method. + * @return array of removed DOMElements */ public function filterContent() { wfProfileIn( __METHOD__ ); @@ -156,8 +158,7 @@ class HtmlFormatter { } } } - - $this->removeElements( $domElemsToRemove ); + $removed = $this->removeElements( $domElemsToRemove ); // Elements with named IDs $domElemsToRemove = array(); @@ -167,7 +168,7 @@ class HtmlFormatter { $domElemsToRemove[] = $itemToRemoveNode; } } - $this->removeElements( $domElemsToRemove ); + $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); // CSS Classes $domElemsToRemove = array(); @@ -183,7 +184,7 @@ class HtmlFormatter { } } } - $this->removeElements( $domElemsToRemove ); + $removed = array_merge( $removed, $this->removeElements( $domElemsToRemove ) ); // Tags with CSS Classes foreach ( $removals['TAG_CLASS'] as $classToRemove ) { @@ -192,16 +193,17 @@ class HtmlFormatter { $elements = $xpath->query( '//' . $parts[0] . '[@class="' . $parts[1] . '"]' ); - - $this->removeElements( $elements ); + $removed = array_merge( $removed, $this->removeElements( $elements ) ); } wfProfileOut( __METHOD__ ); + return $removed; } /** * Removes a list of elelments from DOMDocument * @param array|DOMNodeList $elements + * @return array of removed elements */ private function removeElements( $elements ) { $list = $elements; @@ -217,6 +219,7 @@ class HtmlFormatter { $element->parentNode->removeChild( $element ); } } + return $list; } /** @@ -245,7 +248,10 @@ class HtmlFormatter { } /** - * Performs final transformations and returns resulting HTML + * Performs final transformations and returns resulting HTML. Note that if you want to call this + * both without an element and with an element you should call it without an element first. If you + * specify the $element in the method it'll change the underlying dom and you won't be able to get + * it back. * * @param DOMElement|string|null $element ID of element to get HTML from or false to get it from the whole tree * @return string Processed HTML diff --git a/tests/phpunit/includes/HtmlFormatterTest.php b/tests/phpunit/includes/HtmlFormatterTest.php index 99a6efd659..98eff7b387 100644 --- a/tests/phpunit/includes/HtmlFormatterTest.php +++ b/tests/phpunit/includes/HtmlFormatterTest.php @@ -8,17 +8,23 @@ class HtmlFormatterTest extends MediaWikiTestCase { * @dataProvider getHtmlData * @covers HtmlFormatter::getText */ - public function testTransform( $input, $expected, $callback = false ) { + public function testTransform( $input, $expectedText, $expectedRemoved = array(), $callback = false ) { $input = self::normalize( $input ); $formatter = new HtmlFormatter( HtmlFormatter::wrapHTML( $input ) ); if ( $callback ) { $callback( $formatter ); } - $formatter->filterContent(); + $removedElements = $formatter->filterContent(); $html = $formatter->getText(); + $removed = array(); + foreach ( $removedElements as $removedElement ) { + $removed[] = self::normalize( $formatter->getText( $removedElement ) ); + } + $expectedRemoved = array_map( 'self::normalize', $expectedRemoved ); $this->assertValidHtmlSnippet( $html ); - $this->assertEquals( self::normalize( $expected ), self::normalize( $html ) ); + $this->assertEquals( self::normalize( $expectedText ), self::normalize( $html ) ); + $this->assertEquals( asort( $expectedRemoved ), asort( $removed ) ); } private static function normalize( $s ) { @@ -45,6 +51,7 @@ class HtmlFormatterTest extends MediaWikiTestCase { array( 'Blah', '', + array( 'Blah' ), $removeImages, ), // basic tag removal @@ -52,21 +59,30 @@ class HtmlFormatterTest extends MediaWikiTestCase { '
foo
foo
foo
bar foobar
test
baz', - '
test
baz', + array( + '
foo
', + '
foo
', + '
foo
', + 'bar', + 'foobar', + '
', + ), $removeTags, ), // don't flatten tags that start like chosen ones array( '
foo bar
', 'foo bar', + array(), $flattenSomeStuff, ), // total flattening array( '
bar2
', 'bar2', + array(), $flattenEverything, ), // UTF-8 preservation and security -- 2.20.1